Step 1. Pre-Processing our data!¶
# Got to load in our dependencies!
import pandas as pd
import plotly.express as px
import plotly.offline as pyo
import seaborn as sns
import plotly.graph_objects as go
import numpy as np
from pandas.api.types import is_numeric_dtype
from great_tables import GT, md, html, system_fonts, style, loc
# First, we can load in our dataframes, starting with the overall look at all of the listings within the city and its general information for 2024
# low_memory=False reads the whole file in one pass so pandas can infer a single dtype per
# column — this silences the "Columns (17) have mixed types" DtypeWarning seen on import
nov_listings: pd.DataFrame = pd.read_csv('./datasets/new_york_listings.csv', low_memory=False)
# For historical reasons, let's also load in the same listings dataset but from July 2023
jul_23_listings: pd.DataFrame = pd.read_csv('./datasets/NYC-Airbnb-2023.csv')
C:\Users\ernie\AppData\Local\Temp\ipykernel_84516\4071149682.py:5: DtypeWarning: Columns (17) have mixed types. Specify dtype option on import or set low_memory=False.
# Peek at the first three rows to sanity-check the November 2024 load
nov_listings.head(3)
| id | listing_url | scrape_id | last_scraped | source | name | description | neighborhood_overview | picture_url | host_id | ... | review_scores_communication | review_scores_location | review_scores_value | license | instant_bookable | calculated_host_listings_count | calculated_host_listings_count_entire_homes | calculated_host_listings_count_private_rooms | calculated_host_listings_count_shared_rooms | reviews_per_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2595 | https://www.airbnb.com/rooms/2595 | 20241104040953 | 2024-11-04 | city scrape | Skylit Midtown Castle Sanctuary | Beautiful, spacious skylit studio in the heart... | Centrally located in the heart of Manhattan ju... | https://a0.muscache.com/pictures/miso/Hosting-... | 2845 | ... | 4.8 | 4.81 | 4.40 | NaN | f | 3 | 3 | 0 | 0 | 0.27 |
| 1 | 6848 | https://www.airbnb.com/rooms/6848 | 20241104040953 | 2024-11-04 | city scrape | Only 2 stops to Manhattan studio | Comfortable studio apartment with super comfor... | NaN | https://a0.muscache.com/pictures/e4f031a7-f146... | 15991 | ... | 4.8 | 4.69 | 4.58 | NaN | f | 1 | 1 | 0 | 0 | 1.04 |
| 2 | 6872 | https://www.airbnb.com/rooms/6872 | 20241104040953 | 2024-11-04 | city scrape | Uptown Sanctuary w/ Private Bath (Month to Month) | This charming distancing-friendly month-to-mon... | This sweet Harlem sanctuary is a 10-20 minute ... | https://a0.muscache.com/pictures/miso/Hosting-... | 16104 | ... | 5.0 | 5.00 | 5.00 | NaN | f | 2 | 0 | 2 | 0 | 0.03 |
3 rows × 75 columns
# Peek at the first three rows to sanity-check the July 2023 load
jul_23_listings.head(3)
| id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | number_of_reviews_ltm | license | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2595 | Skylit Midtown Castle | 2845 | Jennifer | Manhattan | Midtown | 40.75356 | -73.98559 | Entire home/apt | 150 | 30 | 49 | 2022-06-21 | 0.30 | 3 | 314 | 1 | NaN |
| 1 | 5121 | BlissArtsSpace! | 7356 | Garon | Brooklyn | Bedford-Stuyvesant | 40.68535 | -73.95512 | Private room | 60 | 30 | 50 | 2019-12-02 | 0.30 | 2 | 365 | 0 | NaN |
| 2 | 5203 | Cozy Clean Guest Room - Family Apt | 7490 | MaryEllen | Manhattan | Upper West Side | 40.80380 | -73.96751 | Private room | 75 | 2 | 118 | 2017-07-21 | 0.72 | 1 | 0 | 0 | NaN |
# I like to do this because sometimes the columns have whitespace or weird capitalization you don't even realize
# pandas' vectorized .str accessor does the strip/lower directly on the Index — no np.vectorize round-trip needed
nov_listings.columns = nov_listings.columns.str.strip().str.lower()
# Our first pre-processing step can just be dropping columns we definitely, 100% don't need for our analysis
print(f'nov_listings currently has {len(nov_listings.columns)} columns')
nov_listings.drop(
    columns=['picture_url',
             'host_url',
             'neighbourhood',  # Not really the neighborhood
             'host_thumbnail_url',
             'host_picture_url',
             'host_has_profile_pic',
             'host_identity_verified',
             'license',
             ],
    inplace=True
)


def _parse_price(value) -> float:
    """Convert a '$1,234.00'-style price string to a float.

    Non-strings (e.g. NaN or already-numeric values) are passed straight to float(),
    matching the original behavior of applying float() to every cell.
    """
    if isinstance(value, str):
        value = value.replace('$', '').replace(',', '')
    return float(value)


# Take the dollar sign out of the price column so we can do some analysis with it
nov_listings['price'] = nov_listings['price'].apply(_parse_price)
print(f'nov_listings currently has {len(nov_listings.columns)} columns')
nov_listings currently has 75 columns nov_listings currently has 67 columns
# Can do the same for the 2023 listings, although there is a lot less data
# Use the pandas .str accessor for consistency with the November cleanup above
jul_23_listings.columns = jul_23_listings.columns.str.strip().str.lower()
print(f'jul_23_listings currently has {len(jul_23_listings.columns)} columns')
# These two columns are not needed for the analysis
jul_23_listings.drop(
    columns=['license', 'number_of_reviews_ltm'],
    inplace=True)
print(f'jul_23_listings currently has {len(jul_23_listings.columns)} columns')
jul_23_listings currently has 18 columns jul_23_listings currently has 16 columns
# Now, let's take a look at NaN values for each dataframe
# Hoist the per-column NaN counts (sorted descending) into named series before printing
nov_nan_counts = nov_listings.isna().sum().sort_values(ascending=False)
jul_nan_counts = jul_23_listings.isna().sum().sort_values(ascending=False)
print(f'The number of NaN values per column in nov_listings: \n{nov_nan_counts[:11]}')
print(f'The number of NaN values per column in jul_23_listings: \n{jul_nan_counts}')
# That's helpful, but doesn't really give me a gauge on what percentage of rows this is, so let's calculate that!
def _nan_percentages(df: pd.DataFrame) -> pd.DataFrame:
    """Return a two-column frame ('Column Name', 'Missing Value Percentage') of per-column
    missing-value percentages, sorted descending and rounded to 2 decimals."""
    return pd.DataFrame(
        (df.isna().sum().sort_values(ascending=False) / len(df.index) * 100).round(2),
        columns=['Missing Value Percentage'],
    ).reset_index(names=['Column Name'])


def _nan_table(percentages: pd.DataFrame, period_label: str) -> GT:
    """Build the styled great_tables view of the 10 most-missing columns.

    Args:
        percentages (pd.DataFrame): output of _nan_percentages
        period_label (str): label interpolated into the title, e.g. 'November 2024'
    Returns:
        GT: the styled table
    """
    return (
        GT(percentages[:10])
        .tab_header(
            title=html(f"<span style='font-size:20px; font-weight:bold;'>Missing values for the {period_label} Table</span>"),
            subtitle=html("<span style='font-size:15px; font-weight:bold;'>Sorted by the Percentage of the Column Missing</span>"),
        )
        .tab_options(
            table_font_names=system_fonts("industrial"),
        )
        .data_color(
            columns=['Missing Value Percentage'],
            palette='RdPu',
        )
    )


# Keep the percentage frames as module-level names (later cells may reference them)
nov_nan_percentages: pd.DataFrame = _nan_percentages(nov_listings)
jul_nan_percentages: pd.DataFrame = _nan_percentages(jul_23_listings)
nov_nan_table: GT = _nan_table(nov_nan_percentages, 'November 2024')
jul_nan_table: GT = _nan_table(jul_nan_percentages, 'July 2023')
nov_nan_table.show()
jul_nan_table.show()
The number of NaN values per column in nov_listings: neighborhood_overview 16974 host_about 16224 host_response_time 15001 host_response_rate 15001 host_acceptance_rate 14983 last_review 11560 first_review 11560 host_location 7999 host_neighbourhood 7503 has_availability 5367 description 1044 dtype: int64 The number of NaN values per column in jul_23_listings: last_review 10304 reviews_per_month 10304 name 12 host_name 5 neighbourhood_group 0 neighbourhood 0 id 0 host_id 0 longitude 0 latitude 0 room_type 0 price 0 number_of_reviews 0 minimum_nights 0 calculated_host_listings_count 0 availability_365 0 dtype: int64
| Missing values for the November 2024 Table | |
| Sorted by the Percentage of the Column Missing | |
| Column Name | Missing Value Percentage |
|---|---|
| neighborhood_overview | 45.21 |
| host_about | 43.21 |
| host_response_time | 39.95 |
| host_response_rate | 39.95 |
| host_acceptance_rate | 39.9 |
| last_review | 30.79 |
| first_review | 30.79 |
| host_location | 21.3 |
| host_neighbourhood | 19.98 |
| has_availability | 14.29 |
| Missing values for the July 2023 Table | |
| Sorted by the Percentage of the Column Missing | |
| Column Name | Missing Value Percentage |
|---|---|
| last_review | 24.0 |
| reviews_per_month | 24.0 |
| name | 0.03 |
| host_name | 0.01 |
| neighbourhood_group | 0.0 |
| neighbourhood | 0.0 |
| id | 0.0 |
| host_id | 0.0 |
| longitude | 0.0 |
| latitude | 0.0 |
This creates an interesting dilemma; we can definitely drop calendar_updated, but what about the columns that have a noticeable proportion of their values missing? We could fill them in based upon the column-wide median — that is pretty easy — but I wanted to take a different approach given that I am taking a geography-centered point of view for this project: fill them based upon the median for that column within their borough. I think this creates a more accurate view without getting so specific that we are filling them based upon similar values in their neighborhood (which might have only a handful of values).
# First, let's get rid of that pesky calendar_updated column
nov_listings.drop(columns=['calendar_updated'], inplace=True)
# Let's get all the numerical columns with more than 30% of their values missing
# NOTE: the series below holds percentages on a 0-100 scale, so the threshold must be 30
# (the previous `.3` compared against 0.3%, pulling in far more columns than intended)
missing_columns = [
    name
    for name, val in (nov_listings.isna().sum().sort_values(ascending=False) / len(nov_listings.index) * 100).items()
    if val > 30 and is_numeric_dtype(nov_listings[name])
]
# Create a function that can replace values in a column based for each borough
def fill_na_with_group_means(df: pd.DataFrame, col: str, group_col: str = 'neighbourhood_group_cleansed') -> pd.Series:
    """Fill NaN values in `col` with the median of that column within each `group_col` group.

    (Despite the name, the fill statistic is the *median*, not the mean —
    kept as-is so existing call sites keep working.)

    Args:
        df (pd.DataFrame): dataframe to utilize
        col (str): column whose NaN values should be filled
        group_col (str, optional): column to group by. Defaults to 'neighbourhood_group_cleansed'.

    Returns:
        pd.Series: copy of df[col] with each NaN replaced by the median of its group
    """
    return df[col].fillna(df.groupby(group_col)[col].transform('median'))
# Fill every flagged column's NaNs with its borough-level median
for column_name in missing_columns:
    nov_listings[column_name] = fill_na_with_group_means(nov_listings, column_name)
From here, we would typically do the same for jul_23_listings, but based upon the analysis above, there aren't many important columns with null values at all, so we can leave that.
Step 2. Visualizations¶
# Set plotly to offline mode so we can display these visualizations
# (injects the plotly.js bundle into the notebook so figures render without a server)
pyo.init_notebook_mode()
# For our first visualization, it might be helpful to look at for each borough how the number of listings have changed from 2023 to now
# Listing counts per borough for each snapshot; both series are indexed by borough name,
# so pandas aligns them by label when the DataFrame below is built
num_nov_of_listings: pd.Series = nov_listings.groupby('neighbourhood_group_cleansed').size()
num_jul_23_listings: pd.Series = jul_23_listings.groupby('neighbourhood_group').size()
# City-wide change, negated (* -100) so a shrinking market reads as a positive
# number in the "decreased by X%" title of the figure below
total_diff: float = ((len(nov_listings) - len(jul_23_listings)) / len(jul_23_listings)) * -100
listings_change: pd.DataFrame = pd.DataFrame({
    'borough': num_nov_of_listings.index,
    'July 2023': num_jul_23_listings,
    'November 2024': num_nov_of_listings
})
# Per-borough percent change (positive = more listings in Nov 2024 than Jul 2023)
listings_change['percent_change'] = ((listings_change['November 2024'] - listings_change['July 2023']) / listings_change['July 2023']) * 100
# Grouped bar chart: one pair of bars (July 2023 vs November 2024) per borough
listings_change_fig = px.bar(
    listings_change,
    x='borough',
    y=['July 2023', 'November 2024'],
    barmode='group',
    template='plotly_dark',
    labels={'borough': 'Borough', 'variable': 'Month'},
    color_discrete_map={'July 2023': '#a1c9f4', 'November 2024': '#8de5a1'},
)
# Title, axis label, font, and a horizontal legend centered below the plot
listings_change_fig.update_layout(
    title=dict(text=f'<b>How Local Law 18 changed the number of Airbnbs across NYC boroughs</b><br><sup>The number across the city decreased by {total_diff:.2f}% but that decrease varied across boroughs</sup>'),
    yaxis=dict(title=dict(text='Number of Airbnb Listings')),
    font_family="Raleway, sans-serif",
    legend=dict(
        orientation='h',
        x=.5,
        y=-.3,
        xanchor='center',
        yanchor='bottom',
    ),
)
# Add percent change information above each bar
# Each trace is a single invisible scatter point carrying only a text label
for _, row in listings_change.iterrows():  # index unused; `_` instead of `i`
    listings_change_fig.add_trace(go.Scatter(
        x=[row['borough']],
        y=[row['July 2023']],
        # exactly one label for the one point (the original passed two identical labels)
        text=[f"{row['percent_change']:.2f}%"],
        mode="text",
        showlegend=False,
        textfont=dict(weight=600)
    ))
# Have to do this weird thing where we save the image and then display it within the notebook because plotly graphs mess up my blog lol
listings_change_fig.write_image("../img/listings_change.png", scale=6, engine="kaleido")
